* Project root and working directory.
* The commented-out lines are alternative locations of the same project folder on other machines.
*global path = "/Users/jhainmueller/Dropbox/VoteByMail/"
global path = "/gl/Dropbox/VoteByMail/"
*C:\gl\Dropbox\VoteByMail\DataAnalysis\code
* All dataset names used below are relative to this data directory.
cd "$path/DataAnalysis/data/"


capture program drop norm
program define norm
    * norm varname
    * Min-max rescales one numeric variable in place: x becomes (x - min) / (max - min),
    * so the smallest nonmissing value maps to 0 and the largest to 1.
    * Prints a summarize table before and after the rescale.
    * If all nonmissing values are equal the range is zero and every value becomes missing.
    syntax varname(numeric)
    tempname lo hi
    summarize `varlist'
    * Freeze the range in scalars so the rescale does not depend on r() staying untouched.
    scalar `lo' = r(min)
    scalar `hi' = r(max)
    replace `varlist' = (`varlist' - `lo') / (`hi' - `lo')
    summarize `varlist'
end


use "vbm_data09302014.dta", clear
tabulate year office

* Drop single-row groups: count the rows within each raceID x general cell
* and remove the cells that contain exactly one row.
* ones is a constant helper that is reused by later counts.
generate ones = 1
egen howmany = total(ones), by(raceID general)
tabulate howmany general
drop if howmany == 1
tabulate year office

* Keep only the rows flagged general == 1
keep if general == 1
sort district office year

* Fold the DDS degree into the MD category before encoding
replace  educdegree = "MD" if educdegree=="DDS"
* Numeric, value-labelled copies of the string variables
encode educdegree, gen(educdegree1)
encode firstparty, gen(firstparty1)

* Row-wise maximum of the two party rank variables (rowmax skips missing values)
egen demreptoptwo = rowmax(Republican_Rank Democrat_Rank)

*error with vote share
*Greg Smith vote share in 2008 (candidate 195) is wrong (30 percentage points below other years)
* Scale used by these hand corrections: voteshare is in percent (69.48, 30.25),
* voteshare2p is the two-candidate proportion between 0 and 1.
replace voteshare = 69.48 if candID == 195 & year == 2008
replace voteshare = 30.25 if candID == 237 & year == 2008
replace voteshare2p = 69.48/(69.48+30.25) if candID == 195 & year == 2008
replace voteshare2p = 30.25/(69.48+30.25) if candID == 237 & year == 2008
*another obvious error - corrected based on Ballotpedia.
* The inputs here are raw vote counts. voteshare is multiplied by 100 so it stays on the
* same percent scale as the corrections above; voteshare2p stays a proportion.
replace voteshare = 100*35847/(16850+35847) if candID == 341 & year == 2008
replace voteshare = 100*16850/(16850+35847) if candID == 325 & year == 2008
replace voteshare2p = 35847/(16850+35847) if candID == 341 & year == 2008
replace voteshare2p = 16850/(16850+35847) if candID == 325 & year == 2008
replace voteshare2p = 60.5/(60.5+36.8) if candID == 341 & year == 2012
replace voteshare2p = 36.8/(60.5+36.8) if candID == 178 & year == 2012
* Eyeball the corrected rows
list votes*  if candID == 341 & year == 2008
list votes*  if candID == 341 & year == 2012
list votes*  if candID == 325 & year == 2008
/*276 John Lim - looks right, though I haven't confirmed exact numbers: 
In 2004, Lim was elected State Representative of District 50. He was re-elected to that office in 2006, serving in the 73rd Oregon Legislative Assembly and the 74th Oregon Legislative Assembly.
John Lim lost his bid for re-election to the Oregon House in the 2008 general election. Former police officer Greg Matthews, a Democrat, took over representation of Oregon's 50th District in January 2009.
*/


* open seats
* A race is flagged open when the incumbent values of its rows sum to zero.
* NOTE(review): this runs before the hand corrections to incumbent further down - confirm that is intended.
egen howmanyinc = total(incumbent), by(raceID)
tabulate howmanyinc
generate openseat = (howmanyinc == 0)
drop howmanyinc

*merge appearance ratings
*merge list of picture IDs candidates with repeat pictures
*candidate ID mistake
recode candID 271 = 272
merge 1:1 candID year using candidate_list_CORRECTED_1.29.14.dta
drop _merge
rename picture_id picture_ID
* NOTE(review): m:m merges pair rows by position within each key group, which is presumably
* why repeated pictures do not pick up their ratings. If appearance_ratings has one row per
* picture_ID and year, an m:1 merge would make the fill-down workaround unnecessary - confirm.
merge m:m picture_ID year using appearance_ratings, gen(pictures_merge)
merge m:m picture_ID year using appearance_ratings, gen(pictures_merge2) update
* Both merges above store their result in a gen() variable, so _merge exists here only if
* the using file carried one along. capture keeps the script from stopping when it does not.
capture drop _merge
*cannot get it to merge repeat pictures - strange

summarize fem_-white
*workaround: fill a missing rating from the previous row when that row has the same picture_ID
* (relies on the row order left by the merge above)
foreach v of varlist fem_-white {
    replace `v' = `v'[_n-1] if `v' == . & `v'[_n-1] != . & picture_ID == picture_ID[_n-1]
}
*saveold vote_appearance, replace
summarize fem_-white

*dataset for kelsey to get 2014 pictures
append using vbm2014

* Word count of the full name drives the split below
generate wc = wordcount(name)

* First name = every word except the last, glued together without spaces,
* for names of two to five words. Other word counts leave fn empty.
generate fn = ""
forvalues j = 1/4 {
    replace fn = fn + word(name, `j') if inrange(wc, `j' + 1, 5)
}

* Last name = final word, or the word before it when the final word is Jr
* NOTE(review): for a Jr name fn still contains the surname, since fn takes every word but the last.
generate ln = word(name, wc)
replace ln = word(name, wc - 1) if ln == "Jr"

tabulate wc
tabulate name if wc == 5
replace ln = "Nortness" if ln == "CNortness"
tabulate fn
tabulate ln
order name fn ln wc

* Only the 2014 rows take their name fields from the split
replace candlastname = ln if year == 2014
replace candfirstname = fn if year == 2014
 *fix
 * Put the spaces and punctuation back into first names that the split above glued together.
 * The two lists are parallel: word i of glued is rewritten to word i of spaced.
 local glued CaddyMc ElizabethSteiner SaraA TimothyEMc AlanR MichaelP RobertR KnuteC BetsyL LauraD LarryC ChristopherP KathyB JodiL ScottA CarlaC EricD
 local spaced `" "Caddy Mc" "Elizabeth Steiner" "Sara A." "Timothy E Mc" "Alan R" "Michael P" "Robert R" "Knute C" "Betsy L" "Laura D" "Larry C" "Christopher P" "Kathy B" "Jodi L" "Scott A" "Carla C" "Eric D" "'
 local nfix : word count `glued'
 forvalues i = 1/`nfix' {
     local from : word `i' of `glued'
     local to : word `i' of `spaced'
     replace candfirstname = "`to'" if candfirstname == "`from'"
 }
 * One correction is tied to a specific last name
 replace candfirstname = "Richmond" if candfirstname == "Rich" & candlastname == "Harisay"
 
 *assign candID
 * Rows with a missing candID (presumably the appended 2014 rows) take the ID of the row
 * directly above when first and last name match. Sorting by name and then year puts a
 * candidate's later rows right below the earlier ones, and replace works top-down, so an
 * ID carries through consecutive missing rows.
    sort candlastname candfirstname year office district
    replace candID = candID[_n-1] if candID == . & candID[_n-1] != . & candfirstname == candfirstname[_n-1] & candlastname == candlastname[_n-1]
    *for var candID: replace X = X[_n-1] if X ==. & X[_n-1]! =. & picture_ID == picture_ID[_n-1]

sort candlastname candfirstname year office district
order office year district candlastname candfirstname candID


***Merge finance data***
* Key variables are spelled out in full so the merge does not depend on name abbreviation.
merge m:1 party office district year using receiptsdata_formerge.dta
* Drop finance rows that have no matching candidate row
drop if _merge == 2

*544

* check if names match
* FTM_candidate is split at the comma-space separator; the first piece is treated as the
* last name and the second as the first name.
split FTM_candidate, parse(", ")
rename FTM_candidate1 FTM_candlastname
rename FTM_candidate2 FTM_candfirstname
replace FTM_candlastname = proper(lower(FTM_candlastname))
list candlastname FTM_candlastname
generate test2 = (candlastname == FTM_candlastname)
tabulate test2
* most match perfectly
list FTM_cand* candlastname candfirstname if test2 == 0
* the rest match as well, with slightly different spelling

* there are a few cases where their incumbency variable and ours differ
* need to check
tabulate incumbent FTM_incumbency_status

* Hand corrections to incumbent, following the cross-tab against the FTM incumbency status.
* Each line targets one candidate-year by last name, first name and year.

* fix Buckley: he did win in 2010, so he is coded as incumbent in 2012
replace incumbent=1 if candlastname=="Buckley" & candfirstname=="Peter" & year==2012
* fix Hoyle: she was appointed in 2009
replace incumbent=1 if candlastname=="Hoyle" & candfirstname=="Val" & year==2010
* fix Coon: appointed in 2008
replace incumbent=1 if candlastname=="Coon" & candfirstname=="Dwight" & year==2010

* fix Galizio: 2004 was his first race
replace incumbent=0 if candlastname=="Galizio" & candfirstname=="Larry" & year==2004
* fix Roblan: in 2012 he first ran for the senate and at the time was a house member
replace incumbent=0 if candlastname=="Roblan" & candfirstname=="Arnie" & year==2012
* Deckert is miscoded in the FTM data
* fix Avakian: in 2006 he first ran for the senate and at the time was a house member
replace incumbent=0 if candlastname=="Avakian" & candfirstname=="Brad" & year==2006
* Hayward is miscoded in the FTM data
* Burdick is miscoded in the FTM data
* Brewer is miscoded in the FTM data
*list party office distric year candlastname candfirstname if incumbent==1 & FTM_incumbency_status=="Open"

*Merge in Kelsey 2004 fixes plus other years
* Clear the previous merge indicator, then list any repeated name-year keys
* (a 1:1 merge needs the key to be unique).
drop _merge
duplicates list candlastname candfirstname year
* update replace: values from the fixes file overwrite the values in memory
merge 1:1 candlastname candfirstname year using kelsey2004_other_fixes.dta, update replace
order office year district candlastname candfirstname candID picture_ID
sort candlastname candfirstname year office district candID picture_ID
drop _merge
* Keep only the two major parties
keep if inlist(party, "Republican", "Democrat")
replace district = 34 if candlastname == "Avakian" & year == 2004

*Merge in Kelsey 2014 picture variables
  *https://docs.google.com/spreadsheets/d/1lUp5APhVDzUZ-qnzsV_-RIdNINeIFC5fHCz6TWz-Z14/edit?usp=sharing
* update replace: values from the picture list overwrite the values in memory
merge 1:1 candlastname candfirstname year using list_pictures_2014.dta, update replace
* Rows that exist only in the picture list
list candlastname candfirstname if _merge == 2
*sort  year  candlastname candfirstname 
*edit  year  candlastname candfirstname  if year == 2014
* Strip the file extension from the one picture ID that carries it
replace picture_ID = "2035_14" if picture_ID == "2035_14.jpg"

* Drop single-row contests: reset the constant helper for every row, count the rows
* per district x year x office, and remove the cells that contain exactly one row.
replace ones = 1
egen howmany2 = total(ones), by(district year office)
tabulate howmany2
sort year office district party candlastname candfirstname howmany2
order year office district party candlastname candfirstname howmany2
drop if howmany2 == 1
tabulate year office, missing
tabulate year district, missing
/*
 *export list of 2004 pictures for rating
  drop if comp_!=.
  keep if repeated_pic == 0 & year== 2004 
  drop if picture_ID == "no photo submitted"
  g picture = "'" + picture_ID + ".jpg',"
*/  

*Merge missing 2004 appearance ratings and all 2014 appearance ratings
  *To start collecting March 18, 2015



  
  





*Senator indicator
g Senator = office == "State Senator"


*Republican indicator variable (before tsset)
g GOP = party == "Republican"
g Democratic = party == "Democrat"

*tsset (trick to code variables below in terms of Democrat advantage)
* Panel = district and office; time = year*100 + GOP. Within one district, office and year
* the Democrat sits at time t and the Republican at t + 1, so the l. and f. operators used
* further down reach the opponent's row.
* The composite keys are stored as double. The default float type holds integers exactly
* only up to 16,777,216, so district*1000000 + year would be rounded for larger district
* numbers and different years would end up with the same key.
g double district_office = district*1000000 + Senator*100
g double district_office_year = district*1000000 + year
* year*100 + GOP stays far below the float limit, so the default type is exact here
g year_GOP = year*100 + GOP
duplicates list district_office year_GOP
duplicates tag district_office year_GOP, g(duplicate)
sort district_office year_GOP
tsset district_office year_GOP

*code receipts
*kdensity FTM_total_
* Log of one plus total receipts
generate receipts = log(FTM_total_ + 1)
* Opponent value: previous time slot if present, otherwise the next one
generate receipts_opponent = cond(l.receipts == ., f.receipts, l.receipts)
* Signed versions: kept as is on Democrat rows (GOP == 0), negated on Republican rows
generate receipts_D = cond(GOP == 0, receipts, -receipts)
generate receipts_o_D = cond(GOP == 0, receipts_opponent, -receipts_opponent)


*gender
* female is 1 when gen_ rounds to 2 and 0 otherwise.
* NOTE(review): a missing gen_ also yields 0 here, not missing - confirm that is intended.
generate female = (round(gen_) == 2)
tabulate female
* 1 when the adjacent row in either direction has female == 1; 0 otherwise,
* including when there is no adjacent row
generate female_opponent = (l.female == 1) | (f.female == 1)
tabulate female female_opponent, missing
* 1 when the row's own flag equals the opponent flag
generate matched_gender = (female == female_opponent)
tabulate matched_gender
tabulate female female_opponent, summarize(matched_gender)
  *tabulate  occupationrec female,col
*White
tabulate white

*count multiple appearances by the same candidate
* Number of rows per candidate within one district and office
egen candidate_same_district = total(ones), by(candID district office)
* Number of rows per candidate overall
egen candidate_same = total(ones), by(candID)


*vote
* Opponent's two-party share: previous time slot if present, otherwise the next one
generate vote_opponent = cond(l.voteshare2p == ., f.voteshare2p, l.voteshare2p)
* Own share minus opponent share on Democrat rows (GOP == 0), opponent minus own on
* Republican rows
generate vote_D = cond(GOP == 0, voteshare2p - vote_opponent, vote_opponent - voteshare2p)
*competence
* Rename the raw rating, rescale it to the 0-1 range with norm, then reverse it (1 - x)
rename comp_ competence
norm competence
replace competence = 1 - competence
* Opponent value: previous time slot if present, otherwise the next one
generate competence_opponent = cond(l.competence == ., f.competence, l.competence)
* Signed versions: kept as is on Democrat rows (GOP == 0), negated on Republican rows
generate competence_D = cond(GOP == 0, competence, -competence)
generate competence_o_D = cond(GOP == 0, competence_opponent, -competence_opponent)
*incumbent
* Opponent value: previous time slot if present, otherwise the next one
generate incumbent_opponent = cond(l.incumbent == ., f.incumbent, l.incumbent)
* Signed versions: kept as is on Democrat rows (GOP == 0), negated on Republican rows
generate incumbent_D = cond(GOP == 0, incumbent, -incumbent)
generate incumbent_o_D = cond(GOP == 0, incumbent_opponent, -incumbent_opponent)

*femininity
*scatter fem_ fem_sd
*scatter fem_ female
* Opponent value: previous time slot if present, otherwise the next one
generate fem_opponent = cond(l.fem_ == ., f.fem_, l.fem_)
* Signed versions: kept as is on Democrat rows (GOP == 0), negated on Republican rows
generate fem_D = cond(GOP == 0, fem_, -fem_)
generate fem_o_D = cond(GOP == 0, fem_opponent, -fem_opponent)
* Inverse of the squared fem_sd
generate fem_iv = 1/fem_sd^2


*fix incorrect candidate ID
replace candID = 1028 if candID == 1020 & candlastname == "Cairns"

* Re-declare the panel by candidate and calendar year; the duplicates listings are checks only
duplicates list candID year
sort candID year
tsset candID year
duplicates list district_office year
* Change relative to the same candidate's row two years earlier
g competence_D_change = competence_D - l2.competence_D
egen change_max = max(abs(competence_D_change)), by(candID)
g lagg_incumbent = l2.incumbent
* Stored as double: the sum exceeds 16,777,216 for larger district numbers, and a default
* float would round it so that neighbouring candID values get the same key.
* NOTE(review): even when exact, a plain sum can coincide for different candID and
* district_office pairs - confirm that this is acceptable for its later use.
g double candidate_district_office = candID + district_office
order district office year district_office name candID raceID GOP incumbent lagg_incumbent vote_D competence*D  competence_opponent competence voteshare2p
saveold voteapp_full, replace

*main regression
* OLS of vote_D on competence_D and competence_o_D with candidate and year indicator sets,
* restricted to rows with candidate_same_district > 1.
regress  vote_D  i.candID i.year competence_D  competence_o_D  if candidate_same_district >1 
* outreg2 is a user-written command. keep() limits the exported rows to the two competence
* terms, and append adds a column to the existing Table output.
  	outreg2 using Table, word se dec(2) keep(competence_D  competence_o_D) ctitle(Cand. Same DYFE) append
	